1 PLN Modeling

Logo

2 Import data, data mangling, scaling

library("dplyr")
library("phyloseq")
library("SNFtool")
library("pheatmap")
library("igraph")
library("tibble")
library("RCy3")
library("igraph")
library(microbiome)
library(arcdiagram)
library(MetaNet)
library(PLNmodels)
library(ggplot2)
library(future)
library(factoextra)
library(tidyr)
plan(multisession, workers = 15)

3 Import Data

3.1 loading

merged_metagenomes <- phyloseq::import_biom("../data/fastq/kraken/bracken//merge_species.biom")
meta <- readxl::read_excel("../data/Glossina_metadata.xlsx")

# Set the new column names in the phyloseq object
sample_names(merged_metagenomes) <- gsub("_.*$|\\.kraken_report_bracken_genuses$", "", sample_names(merged_metagenomes))
sample_names(merged_metagenomes) <- gsub("^Gl", "GI", sample_names(merged_metagenomes))
# Sort the 'meta' data frame by the 'SRA.identifier' column
meta$Sample <- gsub("_.*$", "", meta$Sample)
meta$Samples <- meta$Sample
meta <- meta %>% column_to_rownames(var = "Samples")

# meta$Sample == sample_names(merged_metagenomes)

# Associate the sorted metadata to the phyloseq object as sample data
merged_metagenomes@sam_data <- sample_data(meta)

# Remove the unnecessary 'k_' prefix in the taxonomy data
merged_metagenomes@tax_table@.Data <- substring(merged_metagenomes@tax_table@.Data, 4)

# Rename the columns of the taxonomy table to represent taxonomic ranks
colnames(merged_metagenomes@tax_table@.Data) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

#Keep only the kingdom of interest
merged_metagenomes <- subset_taxa(merged_metagenomes, Kingdom =="Bacteria")
# head(sample_data(merged_metagenomes))
# head(tax_table(merged_metagenomes))

# Define the list of samples to remove
samples_to_remove <- c("GI-103", "GI-59", "GI-104", "GI-111", "GI-121", 
                       "GI-124", "GI-125", "GI-14", "GI-15", "GI-16", 
                       "GI-17", "GI-25", "GI-26", "GI-27", "GI-28", 
                       "GI-29", "GI-30", "GI-31", "GI-32", "GI-33", 
                       "GI-34", "GI-36", "GI-38", "GI-39", "GI-45", "GI-47", "GI-51", "I9", "I7")

# Prune these samples from your phyloseq object
merged_metagenomes <- prune_samples(!(sample_names(merged_metagenomes) %in% samples_to_remove), merged_metagenomes)


physeq <- microbiome::aggregate_rare(merged_metagenomes, level = "Genus", detection = 0.01/100, prevalence = 5/100)

3.1.1 sanity check

# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq)[taxa_sums(physeq) == 0]

# Print the empty OTU names
if (length(empty_otus) > 0) {
    print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
    print("No empty OTUs.")
}
## [1] "No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq)[sample_sums(physeq) == 0]

# Print the empty sample names
if (length(empty_samples) > 0) {
    print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
    print("No empty samples.")
}
## [1] "No empty samples."
# # Remove samples with no counts
# physeq_F <- prune_samples(sample_sums(physeq) > 0, physeq)
# # Remove OTUs with no counts
# physeq_F <- prune_taxa(taxa_sums(physeq) > 0, physeq)
tax_table_df <- as.data.frame(physeq@tax_table)
otus_to_keep <- rownames(tax_table_df[tax_table_df$Genus != "Other", ])
physeq <- prune_taxa(otus_to_keep, physeq)

3.2 Variable used for the analysis

# Phyloseq object
physeq <- microbiome::aggregate_taxa(physeq, level = "Genus")
# Count data
counts <- otu_table(physeq) %>% as.data.frame() %>% t()
meta <- physeq@sam_data

3.3 Create Female and Male dataset

# Subset phyloseq object for female samples
physeq_F <- subset_samples(physeq, sex == "F")
samples_to_remove <- c("GI-45", "GI-51")
physeq_F <- prune_samples(!(sample_names(physeq_F) %in% samples_to_remove), physeq_F)
# Remove samples with no counts
physeq_F <- prune_samples(sample_sums(physeq_F) > 0, physeq_F)
# Remove OTUs with no counts
physeq_F <- prune_taxa(taxa_sums(physeq_F) > 0, physeq_F)
data_F <- prepare_data(counts = physeq_F@otu_table, covariates = physeq_F@sam_data, offset = "Wrench")


# Subset phyloseq object for female samples
physeq_M <- subset_samples(physeq, sex == "M")
physeq_M <- prune_samples(!(sample_names(physeq_M) %in% samples_to_remove), physeq_M)
data_M <- prepare_data(counts = physeq_M@otu_table, covariates = physeq_M@sam_data, offset = "Wrench")

3.3.1 Sanity check

# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq_F)[taxa_sums(physeq_F) == 0]

# Print the empty OTU names
if (length(empty_otus) > 0) {
    print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
    print("physeq_F: No empty OTUs.")
}
## [1] "physeq_F: No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq_F)[sample_sums(physeq_F) == 0]

# Print the empty sample names
if (length(empty_samples) > 0) {
    print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
    print("physeq_F: No empty samples.")
}
## [1] "physeq_F: No empty samples."
# # Remove samples with no counts
# physeq_F <- prune_samples(sample_sums(physeq_F) > 0, physeq)
# # Remove OTUs with no counts
# physeq_F <- prune_taxa(taxa_sums(physeq_F) > 0, physeq)

# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq_F)[taxa_sums(physeq_M) == 0]

# Print the empty OTU names
if (length(empty_otus) > 0) {
    print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
    print("physeq_M: No empty OTUs.")
}
## [1] "physeq_M: No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq_F)[sample_sums(physeq_M) == 0]

# Print the empty sample names
if (length(empty_samples) > 0) {
    print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
    print("physeq_M: No empty samples.")
}
## [1] "physeq_M: No empty samples."

4 Check the distribution of the data

We have a clear Poisson distribution.

hist(as.matrix(as.data.frame(physeq@otu_table), nclass = 10, main = "Abundance of the OTU data", xlab = "values"))

5 Male and female combined through time

5.1 Poisson lognormal models

The Poisson lognormal model is a statistical model used to describe count data that exhibits overdispersion, which means the variance is greater than the mean. This model combines two components: a Poisson distribution and a lognormal distribution.

By combining the Poisson and lognormal distributions, this model provides a flexible approach to modeling count data that exhibit greater variability than what can be captured by a simple Poisson distribution alone.

5.1.1 Create a PNL suited data frame.

# data PLN object
PLN_data <- PLNmodels::prepare_data(counts = counts, covariates = meta)

5.1.2 Setting up the GLM model

Intercept (~1): - There is a common baseline level of abundance for taxa in the microbiome samples, and zero abundance represents a biological state (not a data artifact), then Abundance ~ 1 + offset(log(Offset)) is appropriate. The intercept allows to model this baseline. - Implicitly assume that all taxa have some baseline level of abundance, even if some taxa may have zero abundance in certain samples. Whith this model, we are looking at how each taxon’s abundance varies around this assumed baseline level.

No Intercept (~0): - Zero abundance represents the true absence of a taxon and there isn’t a common baseline abundance that all taxa share, then Abundance ~ 0 + offset(log(Offset)) might be more appropriate. This formulation explicitly models the starting point as zero. - Baseline Level: This refers to a hypothetical or observed starting point or average level of abundance across all taxa in your microbiome samples. It represents a point of comparison against which you assess changes or differences. - This approach is useful when zero abundance values (where taxa are absent) are not due to technical limitations but rather reflect genuine biological absence. It allows to interpret changes in abundance levels in relation to what you consider to be the typical or expected level of abundance for each taxon.

Offset (offset(log(Offset))): - Purpose: The offset term adjusts for differences in exposure or scaling factors that are known and fixed for each observation - Example: In microbiome studies, offsets are often used to account for differences in sequencing depth or sampling effort across samples. The logarithm transformation (log(Offset)) is common to handle the typical skewness in abundance data.

We can test both models (~1 and ~0) and evaluate their goodness of fit metrics (like AIC or BIC) to determine which one better captures the variability in our data.

Model_01_0 <- PLN(PLN_data$Abundance ~ 0 + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_02_1 <- PLN(PLN_data$Abundance ~ 1 + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
rbind(
  "No Intercept" = Model_01_0$criteria,
  "With Intercept" = Model_02_1$criteria
) %>% knitr::kable()
nb_param loglik BIC ICL
No Intercept 378 -2450.234 -3185.788 -4940.717
With Intercept 405 -2376.152 -3164.245 -4792.277
  • Log-likelihood (loglik): A higher value indicates a better fit to the data.
  • Bayesian Information Criterion (BIC): A lower value indicates a better model, balancing model fit and complexity.
  • Integrated Complete Likelihood (ICL): Similar to BIC, a lower ICL value is preferred, with additional penalization for the complexity of the model.

With Intercept (PLN(PLN_data$Abundance ~ 1 + offset(log(Offset)), PLN_data)) appears to be the better model choice among the two based on the given criteria.

5.2 Model evaluation

Model_01 <- PLN(PLN_data$Abundance ~ 1, PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_02 <- PLN(PLN_data$Abundance ~ 1 + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_03 <- PLN(PLN_data$Abundance ~ 1 + sex + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_04 <- PLN(PLN_data$Abundance ~ 1 + Ecological.gradient + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_05 <- PLN(PLN_data$Abundance ~ 1 + Caracteristic.Gradient + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
Model_06 <- PLN(PLN_data$Abundance ~ 1 + infection + offset(log(Offset)), PLN_data)
## 
##  Initialization...
##  Adjusting a full covariance PLN model with nlopt optimizer
##  Post-treatments...
##  DONE!
results <- rbind(
  "PLN_data" = Model_01$criteria,
  "PLN_data Offset" = Model_02$criteria,
  "PLN_data sex" = Model_03$criteria,
  "PLN_data Ecological gradient" = Model_04$criteria,
  "PLN_data Caracteristic Gradient" = Model_05$criteria,
  "PLN_data Infection" = Model_06$criteria
)

print(results) %>%
  arrange(loglik) %>%
  knitr::kable()
##                                 nb_param    loglik       BIC       ICL
## PLN_data                             405 -2379.161 -3167.254 -4873.842
## PLN_data Offset                      405 -2376.152 -3164.245 -4792.277
## PLN_data sex                         432 -2368.872 -3209.505 -4827.396
## PLN_data Ecological gradient         486 -2311.695 -3257.407 -4808.464
## PLN_data Caracteristic Gradient      486 -2334.568 -3280.280 -4699.698
## PLN_data Infection                   432 -2365.896 -3206.529 -4759.633
nb_param loglik BIC ICL
PLN_data 405 -2379.161 -3167.254 -4873.842
PLN_data Offset 405 -2376.152 -3164.245 -4792.277
PLN_data sex 432 -2368.872 -3209.505 -4827.396
PLN_data Infection 432 -2365.896 -3206.529 -4759.633
PLN_data Caracteristic Gradient 486 -2334.568 -3280.280 -4699.698
PLN_data Ecological gradient 486 -2311.695 -3257.407 -4808.464

Selected PLN_data Ecological gradient.

5.2.1 GLM-like interface

One can access the fitted value of the counts (Abundance – Y^) and check that the algorithm basically learned correctly from the data:

data.frame(
  fitted   = as.vector(fitted(Model_06)),
  observed = as.vector(PLN_data$Abundance)
) %>%
  ggplot(aes(x = observed, y = fitted)) +
  geom_point(size = .5, alpha = .25) +
  scale_x_log10() +
  scale_y_log10() +
  theme_bw() +
  annotation_logticks()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

The residual correlation matrix better displays as an image matrix:

Model_06 %>%
  sigma() %>%
  cov2cor() %>%
  corrplot::corrplot(, tl.cex = 0.4)

5.2.2 Observation weights

myPLN_weighted <-
  PLN(
    PLN_data$Abundance ~ 1 + Ecological.gradient + offset(log(Offset)),
    data    = PLN_data,
    weights = runif(nrow(PLN_data)),
    control = PLN_param(trace = 0)
  )
data.frame(
  unweighted = as.vector(fitted(Model_06)),
  weighted   = as.vector(fitted(myPLN_weighted))
) %>%
  ggplot(aes(x = unweighted, y = weighted)) +
  geom_point(size = .5, alpha = .25) +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  theme_bw() +
  annotation_logticks() +
  labs(x = "Unweighted Fitted Values", y = "Weighted Fitted Values")

Effect of Small Weights: Observations with very small weights might be receiving much less influence in the weighted model. If an observation has a weight close to zero, its contribution to the fitted values will be minimal. This can lead to significant deviations between the unweighted and weighted models, especially if the unweighted model provides a much larger fitted value for those observations.

General Linear Trend: Most observations align with a linear relationship, indicating that the weights are generally applied in a consistent manner across the dataset. Distinct Clouds: The additional clouds suggest that specific subsets of data behave differently, either due to the extreme or moderate influence of weights, unique characteristics of those observations, or potential data issues.

PLN_data <- PLN_data %>% as.data.frame()
# Extract unweighted fitted values
a <- fitted(Model_06) %>% as.data.frame()
a <- a %>%
  mutate(Sample = rownames(PLN_data)) %>%
  column_to_rownames("Sample")
colnames(a) <- PLN_data$Abundance %>% colnames()

unweighted_df <- a %>%
  mutate(Ecological.gradient =  PLN_data$Ecological.gradient) %>%
  rownames_to_column("Sample") %>%
  pivot_longer(-c(Sample, Ecological.gradient), names_to = "Variable", values_to = "Unweighted")

# Extract weighted fitted values
b <- fitted(myPLN_weighted) %>% as.data.frame()
b <- b %>%
  mutate(Sample = rownames(PLN_data)) %>%
  column_to_rownames("Sample")
colnames(b) <- PLN_data$Abundance %>% colnames()

weighted_df <- b %>%
  mutate(Ecological.gradient = PLN_data$Ecological.gradient) %>%
  rownames_to_column("Sample") %>%
  pivot_longer(-c(Sample, Ecological.gradient), names_to = "Variable", values_to = "Weighted")

# Merge the unweighted and weighted data
merged_df <- unweighted_df %>%
  select(Sample, Variable, Unweighted) %>%
  dplyr::left_join(weighted_df, by = c("Sample", "Variable")) %>%
  select(Sample, Variable, Unweighted, Weighted, Ecological.gradient)

# Inspect the merged data frame
head(merged_df)
## # A tibble: 6 × 5
##   Sample Variable                        Unweighted Weighted Ecological.gradient
##   <chr>  <chr>                                <dbl>    <dbl> <chr>              
## 1 GI-52  Acinetobacter                      0.00238  0.0168  ecotone            
## 2 GI-52  Allorhizobium-Neorhizobium-Par…    0.0258   0.00118 ecotone            
## 3 GI-52  Anoxybacillus                      0.0552   0.160   ecotone            
## 4 GI-52  Arsenophonus                       0.0956   0.103   ecotone            
## 5 GI-52  Bacillus                           0.133    0.832   ecotone            
## 6 GI-52  Buchnera                          34.0     35.6     ecotone
# Define your color palette
colors <- c(
  "wild" = "#009392",
  "ecotone" = "#9ccb86",
  "farmed land" = "#EEB479",
  "urban" = "#CF597E"
)

# Plot with transparency
ggplot(merged_df, aes(x = Unweighted, y = Weighted, color = Ecological.gradient)) +
  geom_point(size = 1, alpha = 0.4) + # Adjust alpha for transparency
  scale_x_log10() +
  scale_y_log10() +
  scale_color_manual(values = colors) +
  theme_bw() +
  labs(
    x = "Unweighted Fitted Value",
    y = "Weighted Fitted Value",
    color = "Ecological.gradient"
  ) +
  theme(legend.position = "bottom")

5.2.3 Investigating the cloud with larger weights on the x axis

results_df <- data.frame(
  unweighted = as.vector(fitted(Model_06)),
  weighted = as.vector(fitted(myPLN_weighted)),
  "Sample" = rownames(PLN_data) # Assuming sample IDs are rownames or add another identifier
)
# Filter the dataframe based on the conditions
filtered_samples <- merged_df %>% filter(Unweighted > 1e-5 & Weighted < 1e-4)

# Print the filtered samples
head(filtered_samples)
## # A tibble: 6 × 5
##   Sample Variable                Unweighted     Weighted Ecological.gradient
##   <chr>  <chr>                        <dbl>        <dbl> <chr>              
## 1 GI-61  Candidatus Baumannia      0.212    0.0000445    farmed land        
## 2 GI-61  Candidatus Ishikawaella   0.0461   0.0000450    farmed land        
## 3 GI-62  Arsenophonus              0.00235  0.0000487    farmed land        
## 4 GI-62  Candidatus Baumannia      0.0200   0.0000000851 farmed land        
## 5 GI-62  Candidatus Ishikawaella   0.00144  0.0000000786 farmed land        
## 6 GI-62  Halomonas                 0.000328 0.0000123    farmed land

5.2.4 Investigating the cloud top left corner.

# Filter the dataframe based on the conditions
filtered_samples <- merged_df %>% filter(Unweighted < 1e-4 & Weighted > 1e-6)

# Print the filtered samples
head(filtered_samples)
## # A tibble: 6 × 5
##   Sample Variable                        Unweighted Weighted Ecological.gradient
##   <chr>  <chr>                                <dbl>    <dbl> <chr>              
## 1 GI-53  Cupriavidus                        5.12e-7  3.37e-4 urban              
## 2 GI-54  Cupriavidus                        5.35e-7  2.13e-4 ecotone            
## 3 GI-56  Allorhizobium-Neorhizobium-Par…    6.15e-5  5.26e-3 ecotone            
## 4 GI-56  Cupriavidus                        2.26e-9  1.91e-4 ecotone            
## 5 GI-62  Cupriavidus                        9.58e-9  5.24e-5 farmed land        
## 6 GI-64  Jeotgalibaca                       3.70e-5  3.77e-4 farmed land

5.3 PCA

ticks_pca_clim <- PLNPCA(
  formula = PLN_data$Abundance ~ 1 +
    Ecological.gradient +
    offset(log(Offset)),
  data = PLN_data,
  ranks = 1:20
)
## 
##  Initialization...
## 
##  Adjusting 20 PLN models for PCA analysis.
##   Rank approximation = 2      Rank approximation = 1      Rank approximation = 16     Rank approximation = 13     Rank approximation = 20     Rank approximation = 17     Rank approximation = 12     Rank approximation = 3      Rank approximation = 8      Rank approximation = 14     Rank approximation = 11     Rank approximation = 4      Rank approximation = 7      Rank approximation = 19     Rank approximation = 18     Rank approximation = 15     Rank approximation = 10     Rank approximation = 6      Rank approximation = 5      Rank approximation = 9 
##  Post-treatments
##  DONE!
plot(ticks_pca_clim)

ticks_pca_clim_best <- getBestModel(ticks_pca_clim, crit = "ICL")
plot(ticks_pca_clim_best, map = "individual", axes = 1:2, ind_cols = PLN_data$Ecological.gradient)

sigma(ticks_pca_clim_best) %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

# Define the threshold function
keep_high_correlations <- function(x) {
  any(x > 2 | x < -1)
}

########################### Climatic ##################
rows_to_keep <- apply(sigma(ticks_pca_clim_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(ticks_pca_clim_best), 2, keep_high_correlations)

# Subset the matrix to keep only the selected rows and columns
ticks_pca_clim_best_filtered <- sigma(ticks_pca_clim_best)[rows_to_keep, cols_to_keep]

# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
ticks_pca_clim_best_filtered %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

factoextra::fviz_pca_biplot(ticks_pca_clim_best,
  select.var = list(contrib = 10),
  labels = NULL,
  geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.

fviz_pca_var(ticks_pca_clim_best,
  col.var = "contrib",
  gradient.cols = c("#00AFBB",
                    "#E7B800",
                    "#FC4E07"),
  select.var = list(contrib = 20)
)

ticks_pca_clim_best$var %>%
  as.data.frame() %>%
  select(contrib.Dim.1, contrib.Dim.2) %>%
  arrange(contrib.Dim.1) %>%
  tail(n = 10) %>%
  knitr::kable()
contrib.Dim.1 contrib.Dim.2
Staphylococcus 4.275688 0.2012983
Planococcus 5.210675 6.6617925
Streptococcus 5.665843 0.0640939
Fictibacillus 6.018175 2.7755791
Enterococcus 6.122822 2.3719877
Anoxybacillus 6.426533 0.4444764
Halomonas 7.196764 0.0476194
Arsenophonus 7.342354 13.4084864
Candidatus Ishikawaella 11.073589 13.1191328
Candidatus Baumannia 13.497340 7.8311907
fviz_contrib(ticks_pca_clim_best, choice = "var", axes = 1:2, top = 10)

5.4 Network

networks_full <- PLNnetwork(formula = PLN_data$Abundance ~ 1 +
  Ecological.gradient +
  offset(log(Offset)), data = PLN_data)
## 
##  Initialization...
##  Adjusting 30 PLN with sparse inverse covariance estimation
##  Joint optimization alternating gradient descent and graphical-lasso
##  sparsifying penalty = 44.26583  sparsifying penalty = 40.88706  sparsifying penalty = 37.76618  sparsifying penalty = 34.88352  sparsifying penalty = 32.22089  sparsifying penalty = 29.7615   sparsifying penalty = 27.48983  sparsifying penalty = 25.39155  sparsifying penalty = 23.45344  sparsifying penalty = 21.66326  sparsifying penalty = 20.00972  sparsifying penalty = 18.48239  sparsifying penalty = 17.07165  sparsifying penalty = 15.76859  sparsifying penalty = 14.56498  sparsifying penalty = 13.45325  sparsifying penalty = 12.42637  sparsifying penalty = 11.47788  sparsifying penalty = 10.60178  sparsifying penalty = 9.792559  sparsifying penalty = 9.045101  sparsifying penalty = 8.354696  sparsifying penalty = 7.716989  sparsifying penalty = 7.127958  sparsifying penalty = 6.583887  sparsifying penalty = 6.081345  sparsifying penalty = 5.617161  sparsifying penalty = 5.188408  sparsifying penalty = 4.792381  sparsifying penalty = 4.426583 
##  Post-treatments
##  DONE!

5.4.1 Assessment of the model

networks_full$criteria %>%
  head() %>%
  knitr::kable()
param nb_param loglik BIC ICL n_edges EBIC pen_loglik density stability
44.26583 135 -3764.443 -4027.141 -5578.442 0 -4027.141 -3785.435 0.000000 NA
40.88706 135 -3732.180 -3994.878 -5546.361 0 -3994.878 -3752.830 0.000000 NA
37.76618 135 -3701.010 -3963.707 -5515.199 0 -3963.707 -3721.305 0.000000 NA
34.88352 135 -3670.830 -3933.527 -5485.028 0 -3933.527 -3690.760 0.000000 NA
32.22089 135 -3641.636 -3904.334 -5455.858 0 -3904.334 -3661.190 0.000000 NA
29.76150 137 -3612.872 -3879.462 -5431.025 2 -3884.630 -3632.061 0.005487 NA

A diagnostic of the optimization process is available via the `convergence field:

networks_full$convergence %>%
  head() %>%
  knitr::kable()
param nb_param status backend objective iterations convergence
out 44.26583 135 3 nlopt 3764.442793 2.000000 0.000007
elt 40.88706 135 4 nlopt 3732.180189 3.000000 0.000004
elt.1 37.76618 135 4 nlopt 3701.009586 2.000000 0.000001
elt.2 34.88352 135 4 nlopt 3670.829585 2.000000 0.000000
elt.3 32.22089 135 3 nlopt 3641.636475 2.000000 0.000003
elt.4 29.76150 137 4 nlopt 3612.872379 2.000000 0.000002

convergence: This column shows the convergence tolerance or criterion. It represents how close the algorithm was to meeting the convergence criteria, with smaller values indicating closer adherence to the convergence criteria. Values close to zero (like the ones shown here) indicate that the models have converged successfully.

plot(networks_full, "diagnostic")

5.4.2 Exploring the path of networks

plot(networks_full, reverse = TRUE)

coefficient_path(networks_full, corr = TRUE) %>%
  ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
  geom_line(show.legend = FALSE) +
  coord_trans(x = "log10") +
  theme_bw()

How to Interpret:

  • As the penalty increases (moves right on the x-axis), the regularization strength increases, typically leading to more coefficients shrinking towards zero.
  • Lines that stay above zero or deviate less from zero indicate more robust and significant relationships between variables, as these edges are less affected by regularization.
  • Lines that move towards zero quickly as the penalty increases indicate weaker relationships that are regularized out of the model early.

5.4.3 Choosing a network

model_StARS <- getBestModel(networks_full, "StARS")
## 
## Stability Selection for PLNnetwork: 
## subsampling: ++++++++++++++++++++
plot(networks_full, "stability")

5.4.4 Structure of a PLNnetworkfit

model_StARS
## Poisson Lognormal with sparse inverse covariance (penalty = 14.6)
## ==================================================================
##  nb_param    loglik       BIC       ICL n_edges      EBIC pen_loglik density
##       154 -3368.619 -3668.289 -5220.122      19 -3695.994  -3385.108   0.052
## ==================================================================
## * Useful fields
##     $model_par, $latent, $latent_pos, $var_par, $optim_par
##     $loglik, $BIC, $ICL, $loglik_vec, $nb_param, $criteria
## * Useful S3 methods
##     print(), coef(), sigma(), vcov(), fitted()
##     predict(), predict_cond(), standard_error()
## * Additional fields for sparse network
##     $EBIC, $density, $penalty 
## * Additional S3 methods for network
##     plot.PLNnetworkfit()

We can finally check that the fitted value of the counts – even with sparse regularization of the covariance matrix – are close to the observed ones:

data.frame(
  fitted   = as.vector(fitted(model_StARS)),
  observed = as.vector(PLN_data$Abundance)
) %>%
  ggplot(aes(x = observed, y = fitted)) +
  geom_point(size = .5, alpha = .25) +
  scale_x_log10(limits = c(1, 4000)) +
  scale_y_log10(limits = c(1, 4000)) +
  theme_bw() +
  annotation_logticks()
## Warning in scale_x_log10(limits = c(1, 4000)): log-10 transformation introduced
## infinite values.
## Warning: Removed 1046 rows containing missing values or values outside the scale range
## (`geom_point()`).

5.4.5 Stability threashold = 0.9

ticks_best_network_2 <- model_StARS
my_graph <- plot(ticks_best_network_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- delete_vertices(my_graph, Isolated)

my_graph <- my_graph_cleaned

vertices_names <- V(my_graph)$name

result <- merged_metagenomes@tax_table %>% 
  as.data.frame() %>%
  filter(Genus %in% vertices_names) %>%
  select(Genus, Family)

# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
  mutate(Color = family_colors[Family])

# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
  mutate(Abundance = rowSums(.)) %>%
  rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
  as.data.frame() %>%
  filter(vertices_left %in% vertices_names) %>%
  select(Abundance)

# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10  # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)

# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)

# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)

# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
  if (i == 97) {
    "#009fff"
  } else {
    color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
  }
})
edge_attr(my_graph)$color <- edge_colors

E(my_graph)$color <- edge_colors

plot(simplify(my_graph),
  layout = layout,
  vertex.size = (V(my_graph)$size) * 2,
  vertex.label.dist = 1,
  vertex.label.cex = 0.5,
  edge.width = E(my_graph)$width,
  edge.color = E(my_graph)$color,
  margin = 0.2,
  main = "Network based on the GLM ~ month + Offset by sex group",
  sub = "Stability threashold 0.20 - Verticles colored by families, Edges colored by weight",
  rescale = T
)
legend("left",
  legend = unique_families,
  col = unique_colors,
  pch = 19,
  title = "Families",
  pt.cex = 2,
  cex = 0.5,
  bty = "n",
  y.intersp = 0.6,
  inset = 1 / 10
)

my_graph_metaN <- c_net_update(my_graph)
## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color

# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")

c_net_plot(my_graph_metaN,
           vertex.color = V(my_graph_metaN)$color, vertex.size = ((V(my_graph_metaN)$size)*2), 
           vertex.label.color = "black",vertex.label = V(my_graph_metaN)$name, vertex.label.cex = 0.5,
           edge_width_range = c(0.5, 3), edge.color = "green4", edge.curved = 0.4,
           legend = F, legend_number = F, 
           edge_legend_title = "Weight",
           size_legend = F, size_legend_title = "Abundance",
           width_legend = T, width_legend_title = "Weight"
           )
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
  legend = unique_families,
  col = unique_colors,
  # title = "Families",
  pch = 19,
  pt.cex = 2,
  cex = 0.6,
  text.font = 3,
  bty = "n",
  y.intersp = 0.4,
  inset = 6 / 10
)
legend_x <- 0.85
legend_y <- 0.9 
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)

# 
# # Define the thickness range
# min_thickness <- 0.5
# max_thickness <- 3
# 
# # Calculate the thickness for each weight
# thickness_values <- min_thickness + (max_thickness - min_thickness) * normalized_weights
# 
# # Create a unique set of thickness values for the legend
# unique_thickness <- unique(round(thickness_values,digits = 0))
# 
# # Generate labels for the thickness values
# thickness_labels <- round(unique_thickness, 2)
# 
# # Add the legend for the weights
# legend("right",
#        legend = thickness_labels,
#        col = "black",
#        lty = 1, # Line type
#        lwd = unique_thickness, # Line width corresponding to thickness
#        title = "Weights",
#        cex = 0.5, y.intersp = 0.5,
#        text.font = 3 # Italic text
# )
V(my_graph_metaN)$v_class <- result$Family

c_net_plot(my_graph_metaN,  
           coors = g_layout_polygon(my_graph_metaN, group = "v_class")
           )

5.4.6 Arc diagram

## get edgelist
edgelist = as_edgelist(my_graph)

order_species <- c("Trueperella_1","Corynebacterium_6", "Fusobacterium_2","Corynebacterium_1", "Corynebacterium_3", "Corynebacterium_4", "Corynebacterium_8", 
                   "Streptococcus_1", "Peptoniphilus_1", "Mycobacterium_1", "Rhodococcus_1", "Multi-affiliation1_1", "Sphingomonas_1", 
                   "Francisella_10", "Francisella_6", "Francisella_9", "Williamsia_2", "Corynebacterium_5", "Multi-affiliation_1", "Corynebacterium_7",
                   "Halomonas_2", "Mycobacterium_2", "Williamsia_1", "Nocardioides_1",  "Francisella_8", "Xanthomonas_1", "Acinetobacter_2", 
                   "Staphylococcus_3", "Staphylococcus_4", "Acinetobacter_1", "Candidatus Midichloria_1", "Francisella_4", "Francisella_5", 
                   "Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7") 

result <- result %>%  mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]

# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist, 
        cex.labels = 0.7,
        sorted = FALSE,
        show.nodes = TRUE,
        bg.nodes = result_ordered$Color,
        cex.nodes = (result_ordered$size) * 0.8, 
        pch.nodes = 21, 
        lwd.nodes = 0.2, 
        line = 0,
        lwd.arcs = 20 * (edge_attr(my_graph)$weight),
        col.arcs = edge_attr(my_graph)$color,
        col.labels = "lightgrey", # Set font color for labels
        col.nodes = "black",  # Set font color for node labels
        bg = NA # Use NA to keep the par bg color
#        above = c(1:97,99:114)
)
legend("right",
       legend = unique_families,
       col = unique_colors,
       pch = 19,
       title = "Families",
       pt.cex = 2,
       cex = 0.7,
       text.font = 3, # Set font to Times New Roman (serif)
       text.col = "lightgrey", # Set font color to light grey
       bty = "n",
       y.intersp = 0.5
    #   inset = 1 / 10
)

6 Female Data Set

6.1 PCA

PLN_pca_F <- PLNPCA(
  formula = data_F$Abundance ~ 1 +
    offset(log(Offset)),
  data = data_F,
  ranks = 1:20
)
## 
##  Initialization...
## 
##  Adjusting 20 PLN models for PCA analysis.
##   Rank approximation = 5      Rank approximation = 12     Rank approximation = 15     Rank approximation = 9      Rank approximation = 20     Rank approximation = 6      Rank approximation = 4      Rank approximation = 2      Rank approximation = 7      Rank approximation = 18     Rank approximation = 10     Rank approximation = 11     Rank approximation = 19     Rank approximation = 17     Rank approximation = 14     Rank approximation = 8      Rank approximation = 16     Rank approximation = 1      Rank approximation = 13     Rank approximation = 3 
##  Post-treatments
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## 
##  DONE!
plot(PLN_pca_F)

PLN_pca_F_best <- getBestModel(PLN_pca_F, crit = "BIC")
plot(PLN_pca_F_best, map = "individual", axes = 1:2, ind_cols = data_F$Ecological.gradient)

sigma(PLN_pca_F_best) %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

# Define the threshold function
keep_high_correlations <- function(x) {
  any(x > 2 | x < -1)
}

########################### Climatic ##################
rows_to_keep <- apply(sigma(PLN_pca_F_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(PLN_pca_F_best), 2, keep_high_correlations)

# Subset the matrix to keep only the selected rows and columns
PLN_pca_F_best_filtered <- sigma(PLN_pca_F_best)[rows_to_keep, cols_to_keep]

# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
PLN_pca_F_best_filtered %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

factoextra::fviz_pca_biplot(PLN_pca_F_best,
  select.var = list(contrib = 10),
  labels = NULL,
  geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.

# Define your color palette
colors <- c(
  "wild" = "#009392",
  "ecotone" = "#9ccb86",
  "farmed land" = "#EEB479",
  "urban" = "#CF597E"
)

# Plot PCA with ordered legend
fviz_pca_ind(PLN_pca_F_best,
  col.ind = data_F$Ecological.gradient,
  palette = colors,
  geom.ind = "point",
  repel = TRUE,
  pointsize = 4,
  pointshape = 19,
  col.var = "black",
  arrowsize = 0.6,
  labelsize = 4
) +
  labs(color = "Ecological gradient") +
  scale_color_manual(values = colors) +
  guides(color = guide_legend(order = 1))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

fviz_pca_var(PLN_pca_F_best,
  col.var = "contrib",
  gradient.cols = c("#00AFBB",
                    "#E7B800",
                    "#FC4E07"),
  select.var = list(contrib = 20)
)

PLN_pca_F_best$var %>%
  as.data.frame() %>%
  select(contrib.Dim.1, contrib.Dim.2) %>%
  arrange(contrib.Dim.1) %>%
  tail(n = 10) %>%
  knitr::kable()
contrib.Dim.1 contrib.Dim.2
Wigglesworthia 0.4823455 1.6832498
Buchnera 0.4939708 1.8357243
Staphylococcus 0.5513401 1.4168548
Fictibacillus 0.5899444 0.0569365
Candidatus Curculioniphilus 0.8234310 1.7290751
Cupriavidus 1.0517940 43.7093708
Candidatus Ishikawaella 6.8902259 0.0589862
Candidatus Baumannia 10.0833813 3.3880136
Arsenophonus 17.7778363 0.3021989
Halomonas 57.7739258 0.4224482
fviz_contrib(PLN_pca_F_best, choice = "var", axes = 1:2, top = 10)

6.2 Network

6.2.1 Modeling

networks_F <- PLNnetwork(formula = data_F$Abundance ~ 1 +
  Ecological.gradient  +
  offset(log(Offset)), data = data_F)
## 
##  Initialization...
##  Adjusting 30 PLN with sparse inverse covariance estimation
##  Joint optimization alternating gradient descent and graphical-lasso
##  sparsifying penalty = 20.55506  sparsifying penalty = 18.98611  sparsifying penalty = 17.53692  sparsifying penalty = 16.19834  sparsifying penalty = 14.96194  sparsifying penalty = 13.8199   sparsifying penalty = 12.76504  sparsifying penalty = 11.7907   sparsifying penalty = 10.89072  sparsifying penalty = 10.05944  sparsifying penalty = 9.291615  sparsifying penalty = 8.582394  sparsifying penalty = 7.927308  sparsifying penalty = 7.322223  sparsifying penalty = 6.763324  sparsifying penalty = 6.247085  sparsifying penalty = 5.770251  sparsifying penalty = 5.329812  sparsifying penalty = 4.922992  sparsifying penalty = 4.547225  sparsifying penalty = 4.200139  sparsifying penalty = 3.879546  sparsifying penalty = 3.583423  sparsifying penalty = 3.309904  sparsifying penalty = 3.057262  sparsifying penalty = 2.823903  sparsifying penalty = 2.608357  sparsifying penalty = 2.409264  sparsifying penalty = 2.225367  sparsifying penalty = 2.055506 
##  Post-treatments
##  DONE!
plot(networks_F, "diagnostic")

coefficient_path(networks_F, corr = TRUE) %>%
  ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
  geom_line(show.legend = FALSE) +
  coord_trans(x = "log10") +
  theme_bw()

model_StARS <- getBestModel(networks_F, "StARS")
## 
## Stability Selection for PLNnetwork: 
## subsampling: ++++++++++++++++++++
plot(networks_F, "stability")

data.frame(
  fitted   = as.vector(fitted(model_StARS)),
  observed = as.vector(count_f2)
) %>%
  ggplot(aes(x = observed, y = fitted)) +
  geom_point(size = .5, alpha = .25) +
  scale_x_log10(limits = c(1, 4000)) +
  scale_y_log10(limits = c(1, 4000)) +
  theme_bw() +
  annotation_logticks()

6.3 Visualisation

networks_F_2 <- getBestModel(networks_F, crit = "StARS", stability = 0.9)

my_graph <- plot(networks_F_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- igraph::delete_vertices(my_graph, Isolated)
# my_graph_cleaned <- delete_vertices(my_graph, c("Francisella_8", "Candidatus Midichloria_11"))

my_graph <- my_graph_cleaned

vertices_names <- V(my_graph)$name
taxo <- tax_table(physeq)

result <- merged_metagenomes@tax_table %>% 
  as.data.frame() %>%
  filter(Genus %in% vertices_names) %>%
  select(Genus, Family)

# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
  mutate(Color = family_colors[Family])

# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
  mutate(Abundance = rowSums(.)) %>%
  rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
  as.data.frame() %>%
  filter(vertices_left %in% vertices_names) %>%
  select(Abundance)

# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10  # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)

# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)

# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)

# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
  if (i == 97) {
    "#009fff"
  } else {
    color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
  }
})
edge_attr(my_graph)$color <- edge_colors

E(my_graph)$color <- edge_colors

plot(simplify(my_graph),
  layout = layout,
  vertex.size = (V(my_graph)$size) * 2,
  vertex.label.dist = 1,
  vertex.label.cex = 0.5,
  edge.width = E(my_graph)$width,
  edge.color = E(my_graph)$color,
  margin = 0.2,
  main = "Network based on the GLM ~ month",
  sub = "Stability threashold 0.90 - Verticles colored by families, Edges colored by weight",
  rescale = T
)
legend("left",
  legend = unique_families,
  col = unique_colors,
  pch = 19,
  title = "Families",
  pt.cex = 2,
  cex = 0.5,
  bty = "n",
  y.intersp = 0.6,
  inset = 1 / 10
)

edgelist = as_edgelist(my_graph)

order_species <- c("Corynebacterium_3", "Corynebacterium_8",  "Fusobacterium_2", "Corynebacterium_3", "Corynebacterium_4", "Francisella_10", "Francisella_9", "Corynebacterium_6", "Mycobacterium_2", "Rhodococcus_1" , "Williamsia_1" , "Williamsia_2",  "Nocardioides_1" , "Sphingomonas_1", "Streptococcus_1", "Multi-affiliation_1", "Staphylococcus_3", "Staphylococcus_4","Staphylococcus_6" , "Peptoniphilus_1", "Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7", "Candidatus Midichloria_3", "Rickettsia_3", "Candidatus Midichloria_4", "Candidatus Midichloria_5", "Rickettsia_1", "Rickettsia_11", "Rickettsia_2", "Rickettsia_6","Rickettsia_10", "Candidatus Midichloria_9","Rickettsia_7", "Rickettsia_7")


result <- result %>%  mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]

# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist, 
        cex.labels = 0.7,
        sorted = FALSE,
        show.nodes = TRUE,
        bg.nodes = result_ordered$Color,
        cex.nodes = (result_ordered$size) * 0.8, 
        pch.nodes = 21, 
        lwd.nodes = 0.2, 
        line = 0,
        lwd.arcs = 20 * (edge_attr(my_graph)$weight),
        col.arcs = edge_attr(my_graph)$color,
        col.labels = "lightgrey", # Set font color for labels
        col.nodes = "black",  # Set font color for node labels
        bg = NA # Use NA to keep the par bg color
        #        above = c(1:97,99:114)
)
legend("topright",
       legend = unique_families,
       col = unique_colors,
       pch = 19,
       title = "Families",
       pt.cex = 2,
       cex = 0.7,
       text.font = 3, # Set font to Times New Roman (serif)
       text.col = "lightgrey", # Set font color to light grey
       bty = "n",
       y.intersp = 0.45,
       inset = c(-0.6, -0.02))

my_graph_metaN <- c_net_update(my_graph)
## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color

# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")

c_net_plot(my_graph_metaN,
           vertex.color = V(my_graph_metaN)$color, 
           vertex.size = ((V(my_graph_metaN)$size)*1), 
           vertex.label.color = "black",
           vertex.label = V(my_graph_metaN)$name, 
           vertex.label.cex = 0.9,
           edge_width_range = c(0.5, 3), edge.color = "#AA0000", edge.curved = 0.4,
           legend = F, legend_number = F, 
           edge_legend_title = "Weight",
           size_legend = F, size_legend_title = "Abundance",
           width_legend = T, width_legend_title = "Weight", 
           main = "Female Network"
           )
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
  legend = unique_families,
  col = unique_colors,
  # title = "Families",
  pch = 19,
  pt.cex = 2,
  cex = 0.6,
  text.font = 3,
  bty = "n",
  y.intersp = 0.4,
  inset = 8 / 10
)
legend_x <- 0.85
legend_y <- 0.9 
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)

7 Male Data Set

7.1 PCA

PLN_pca_M <- PLNPCA(
  formula = data_M$Abundance ~ 1 +
    offset(log(Offset)),
  data = data_M,
  ranks = 1:20
)
## 
##  Initialization...
## 
##  Adjusting 20 PLN models for PCA analysis.
##   Rank approximation = 16     Rank approximation = 10     Rank approximation = 8      Rank approximation = 3      Rank approximation = 6      Rank approximation = 7      Rank approximation = 12     Rank approximation = 2      Rank approximation = 20     Rank approximation = 1      Rank approximation = 5      Rank approximation = 15     Rank approximation = 17     Rank approximation = 18     Rank approximation = 14     Rank approximation = 11     Rank approximation = 9      Rank approximation = 13     Rank approximation = 19     Rank approximation = 4 
##  Post-treatments
##  DONE!
plot(PLN_pca_M)

PLN_pca_M_best <- getBestModel(PLN_pca_M, crit = "BIC")
plot(PLN_pca_M_best, map = "individual", axes = 1:2, ind_cols = data_M$Ecological.gradient)

sigma(PLN_pca_M_best) %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

# Define the threshold function
keep_high_correlations <- function(x) {
  any(x > 2 | x < -1)
}

########################### Climatic ##################
rows_to_keep <- apply(sigma(PLN_pca_M_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(PLN_pca_M_best), 2, keep_high_correlations)

# Subset the matrix to keep only the selected rows and columns
PLN_pca_M_best_filtered <- sigma(PLN_pca_M_best)[rows_to_keep, cols_to_keep]

# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
PLN_pca_M_best_filtered %>%
  corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)

factoextra::fviz_pca_biplot(PLN_pca_M_best,
                            select.var = list(contrib = 10),
                            labels = NULL,
                            geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.

# Define your color palette
colors <- c(
  "wild" = "#009392",
  "ecotone" = "#9ccb86",
  "farmed land" = "#EEB479",
  "urban" = "#CF597E"
)

# Plot PCA with ordered legend
fviz_pca_ind(PLN_pca_M_best,
             col.ind = data_M$Ecological.gradient,
             palette = colors,
             geom.ind = "point",
             repel = TRUE,
             pointsize = 4,
             pointshape = 19,
             col.var = "black",
             arrowsize = 0.6,
             labelsize = 4
) +
  labs(color = "Ecological gradient") +
  scale_color_manual(values = colors) +
  guides(color = guide_legend(order = 1))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

fviz_pca_var(PLN_pca_M_best,
             col.var = "contrib",
             gradient.cols = c("#00AFBB",
                               "#E7B800",
                               "#FC4E07"),
             select.var = list(contrib = 20)
)

PLN_pca_M_best$var %>%
  as.data.frame() %>%
  select(contrib.Dim.1, contrib.Dim.2) %>%
  arrange(contrib.Dim.1) %>%
  tail(n = 10) %>%
  knitr::kable()
contrib.Dim.1 contrib.Dim.2
Geobacillus 2.610976 0.0009065
Streptococcus 2.658733 0.0408484
endosymbionts 3.067242 13.4215891
Enterococcus 3.069845 0.0073316
Solibacillus 3.111592 0.0069872
Fictibacillus 3.485013 26.3546178
Staphylococcus 3.842718 1.6260811
Bacillus 4.732656 1.3751339
Candidatus Baumannia 7.879148 4.6620008
Halomonas 42.043490 10.9909085
fviz_contrib(PLN_pca_M_best, choice = "var", axes = 1:2, top = 10)

7.2 Network

7.2.1 Modeling

networks_M <- PLNnetwork(formula = data_M$Abundance ~ 1 +
                           Ecological.gradient  +
                           offset(log(Offset)), data = data_M)
## 
##  Initialization...
##  Adjusting 30 PLN with sparse inverse covariance estimation
##  Joint optimization alternating gradient descent and graphical-lasso
##  sparsifying penalty = 41.36325  sparsifying penalty = 38.20603  sparsifying penalty = 35.2898   sparsifying penalty = 32.59616  sparsifying penalty = 30.10812  sparsifying penalty = 27.80999  sparsifying penalty = 25.68728  sparsifying penalty = 23.72659  sparsifying penalty = 21.91556  sparsifying penalty = 20.24276  sparsifying penalty = 18.69765  sparsifying penalty = 17.27048  sparsifying penalty = 15.95224  sparsifying penalty = 14.73462  sparsifying penalty = 13.60993  sparsifying penalty = 12.5711   sparsifying penalty = 11.61156  sparsifying penalty = 10.72526  sparsifying penalty = 9.906609  sparsifying penalty = 9.150446  sparsifying penalty = 8.452     sparsifying penalty = 7.806866  sparsifying penalty = 7.210975  sparsifying penalty = 6.660567  sparsifying penalty = 6.152172  sparsifying penalty = 5.682582  sparsifying penalty = 5.248835  sparsifying penalty = 4.848196  sparsifying penalty = 4.478137  sparsifying penalty = 4.136325 
##  Post-treatments
##  DONE!
plot(networks_M, "diagnostic")

coefficient_path(networks_M, corr = TRUE) %>%
  ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
  geom_line(show.legend = FALSE) +
  coord_trans(x = "log10") +
  theme_bw()

model_StARS <- getBestModel(networks_M, "StARS")
## 
## Stability Selection for PLNnetwork: 
## subsampling: ++++++++++++++++++++
plot(networks_M, "stability")

data.frame(
  fitted   = as.vector(fitted(model_StARS)),
  observed = as.vector(count_f2)
) %>%
  ggplot(aes(x = observed, y = fitted)) +
  geom_point(size = .5, alpha = .25) +
  scale_x_log10(limits = c(1, 4000)) +
  scale_y_log10(limits = c(1, 4000)) +
  theme_bw() +
  annotation_logticks()

7.3 Visualisation

networks_M_2 <- getBestModel(networks_M, crit = "StARS", stability = 0.9)

my_graph <- plot(networks_M_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- igraph::delete_vertices(my_graph, Isolated)
# my_graph_cleaned <- delete_vertices(my_graph, c("Francisella_8", "Candidatus Midichloria_11"))

my_graph <- my_graph_cleaned

vertices_names <- V(my_graph)$name
taxo <- tax_table(physeq)

result <- merged_metagenomes@tax_table %>% 
  as.data.frame() %>%
  filter(Genus %in% vertices_names) %>%
  select(Genus, Family)

# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
  mutate(Color = family_colors[Family])

# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
  mutate(Abundance = rowSums(.)) %>%
  rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
  as.data.frame() %>%
  filter(vertices_left %in% vertices_names) %>%
  select(Abundance)

# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10  # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)

# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)

# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)

# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
  if (i == 97) {
    "#009fff"
  } else {
    color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
  }
})
edge_attr(my_graph)$color <- edge_colors

E(my_graph)$color <- edge_colors

plot(simplify(my_graph),
     layout = layout,
     vertex.size = (V(my_graph)$size) * 2,
     vertex.label.dist = 1,
     vertex.label.cex = 0.5,
     edge.width = E(my_graph)$width,
     edge.color = E(my_graph)$color,
     margin = 0.2,
     main = "Network based on the GLM ~ month",
     sub = "Stability threashold 0.90 - Verticles colored by families, Edges colored by weight",
     rescale = T
)
legend("left",
       legend = unique_families,
       col = unique_colors,
       pch = 19,
       title = "Families",
       pt.cex = 2,
       cex = 0.5,
       bty = "n",
       y.intersp = 0.6,
       inset = 1 / 10
)

edgelist = as_edgelist(my_graph)

order_species <- c("Corynebacterium_3", "Corynebacterium_8",  "Fusobacterium_2", "Corynebacterium_3", "Corynebacterium_4", "Francisella_10", "Francisella_9", "Corynebacterium_6", "Mycobacterium_2", "Rhodococcus_1" , "Williamsia_1" , "Williamsia_2",  "Nocardioides_1" , "Sphingomonas_1", "Streptococcus_1", "Multi-affiliation_1", "Staphylococcus_3", "Staphylococcus_4","Staphylococcus_6" , "Peptoniphilus_1", "Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7", "Candidatus Midichloria_3", "Rickettsia_3", "Candidatus Midichloria_4", "Candidatus Midichloria_5", "Rickettsia_1", "Rickettsia_11", "Rickettsia_2", "Rickettsia_6","Rickettsia_10", "Candidatus Midichloria_9","Rickettsia_7", "Rickettsia_7")


result <- result %>%  mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]

# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist, 
        cex.labels = 0.7,
        sorted = FALSE,
        show.nodes = TRUE,
        bg.nodes = result_ordered$Color,
        cex.nodes = (result_ordered$size) * 0.8, 
        pch.nodes = 21, 
        lwd.nodes = 0.2, 
        line = 0,
        lwd.arcs = 20 * (edge_attr(my_graph)$weight),
        col.arcs = edge_attr(my_graph)$color,
        col.labels = "lightgrey", # Set font color for labels
        col.nodes = "black",  # Set font color for node labels
        bg = NA # Use NA to keep the par bg color
        #        above = c(1:97,99:114)
)
legend("topright",
       legend = unique_families,
       col = unique_colors,
       pch = 19,
       title = "Families",
       pt.cex = 2,
       cex = 0.7,
       text.font = 3, # Set font to Times New Roman (serif)
       text.col = "lightgrey", # Set font color to light grey
       bty = "n",
       y.intersp = 0.45,
       inset = c(-0.6, -0.02))

my_graph_metaN <- c_net_update(my_graph)
## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color

# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")

c_net_plot(my_graph_metaN,
           vertex.color = V(my_graph_metaN)$color, 
           vertex.size = ((V(my_graph_metaN)$size)*1), 
           vertex.label.color = "black",
           vertex.label = V(my_graph_metaN)$name, 
           vertex.label.cex = 0.9,
           edge_width_range = c(0.5, 3), edge.color = "#AA0000", edge.curved = 0.4,
           legend = F, legend_number = F, 
           edge_legend_title = "Weight",
           size_legend = F, size_legend_title = "Abundance",
           width_legend = T, width_legend_title = "Weight", 
           main = "Male Network"
)
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
       legend = unique_families,
       col = unique_colors,
       # title = "Families",
       pch = 19,
       pt.cex = 2,
       cex = 0.6,
       text.font = 3,
       bty = "n",
       y.intersp = 0.4,
       inset = 8 / 10
)
legend_x <- 0.85
legend_y <- 0.9 
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)

future::plan("sequential")